# How soccer became a global sport: where did it start and what changed as more teams were starting to compete.
# Which countries have dominated the different eras of soccer since everything started.


# Cleaning, processing and first exploration

# As seen below, this data set consists of (supposedly) all games since the inaugural Scotland - England in 1872.
# For each game, we have the score, the tournament, the host city and country.

### Loading libraries
library(ggplot2) # Data visualization
## Warning: package 'ggplot2' was built under R version 4.2.2
library(readr) # CSV file I/O, e.g. the read_csv function
## Warning: package 'readr' was built under R version 4.2.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.2.2
# Load the match results from the local CSV export.
# NOTE(review): the path is specific to one machine; adjust it before running elsewhere.
df <- read_csv(
  file = "C://Users//Nishtha//Documents//bhavuk//Semester 6//DV//J Comp//results.csv"
)
## Rows: 44353 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): home_team, away_team, tournament, city, country
## dbl  (2): home_score, away_score
## lgl  (1): neutral
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df, n = 6L)
## # A tibble: 6 × 9
##   date       home_team away_team home_sc…¹ away_…² tourn…³ city  country neutral
##   <date>     <chr>     <chr>         <dbl>   <dbl> <chr>   <chr> <chr>   <lgl>  
## 1 1872-11-30 Scotland  England           0       0 Friend… Glas… Scotla… FALSE  
## 2 1873-03-08 England   Scotland          4       2 Friend… Lond… England FALSE  
## 3 1874-03-07 Scotland  England           2       1 Friend… Glas… Scotla… FALSE  
## 4 1875-03-06 England   Scotland          2       2 Friend… Lond… England FALSE  
## 5 1876-03-04 Scotland  England           3       0 Friend… Glas… Scotla… FALSE  
## 6 1876-03-25 Scotland  Wales             4       0 Friend… Glas… Scotla… FALSE  
## # … with abbreviated variable names ¹​home_score, ²​away_score, ³​tournament
# Let's check whether there are missing values we should clean, by counting
# the NAs in each column. Apparently there are none. Good news, let's continue.
# is.na() is applied to the data frame directly: apply() would first coerce
# every column to character, and is.null() is never TRUE for a column vector.
colSums(is.na(df))
##       date  home_team  away_team home_score away_score tournament       city 
##          0          0          0          0          0          0          0 
##    country    neutral 
##          0          0
# Let's process the data a bit so that we have quicker access to some important features, such as the result or the names of the winning and losing teams. The outcome of a game is encoded as D for a draw, H for a home-team win and A for an away-team win. We will also extract some date-related features such as the day of the week or the month.
# Encode the result of one or more games from the home team's point of view.
#
# Args:
#   home_score: numeric vector of goals scored by the home team.
#   away_score: numeric vector of goals scored by the away team.
# Returns:
#   Character vector with "H" (home win), "A" (away win) or "D" (draw) for
#   each game; NA where a score is missing.
game_outcome <- function(home_score, away_score) {
  goal_diff <- home_score - away_score
  outcome <- rep("D", length(goal_diff))
  # which() drops NA comparisons so they cannot be used as assignment indices.
  outcome[which(goal_diff > 0)] <- "H"
  outcome[which(goal_diff < 0)] <- "A"
  outcome[is.na(goal_diff)] <- NA_character_
  outcome
}

# Name of the winning team for one or more games.
#
# Args:
#   home_score, away_score: numeric vectors of goals scored by each side.
#   home_team, away_team: names of the two sides.
# Returns:
#   The name of the side that scored more goals; NA for a draw (or when a
#   score is missing).
winning_team <- function(home_score, away_score, home_team, away_team) {
  ifelse(home_score > away_score, home_team,
         ifelse(home_score < away_score, away_team, NA))
}

# Name of the losing team for one or more games.
#
# Args:
#   home_score, away_score: numeric vectors of goals scored by each side.
#   home_team, away_team: names of the two sides.
# Returns:
#   The name of the side that scored fewer goals; NA for a draw (or when a
#   score is missing).
losing_team <- function(home_score, away_score, home_team, away_team) {
  ifelse(home_score < away_score, home_team,
         ifelse(home_score > away_score, away_team, NA))
}

# Add date parts (year, month, day of week) and result features (outcome,
# winning team, losing team) to every game.
# Month and weekday names are built from numeric date parts rather than with
# format(date, "%b") / weekdays(date): those return names in the current
# locale, while the factor levels used later in this script are English.
df <- df %>%
  mutate(year = format(date, "%Y"),
         month = month.abb[as.integer(format(date, "%m"))],
         # POSIXlt's wday runs from 0 (Sunday) to 6 (Saturday).
         dayofweek = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday",
                       "Friday", "Saturday")[as.POSIXlt(date)$wday + 1]) %>%
  rowwise() %>%
  mutate(outcome = game_outcome(home_score, away_score),
         winning_team = winning_team(home_score, away_score, home_team, away_team),
         losing_team = losing_team(home_score, away_score, home_team, away_team)) %>%
  ungroup()


head(df)
## # A tibble: 6 × 15
##   date       home_…¹ away_…² home_…³ away_…⁴ tourn…⁵ city  country neutral year 
##   <date>     <chr>   <chr>     <dbl>   <dbl> <chr>   <chr> <chr>   <lgl>   <chr>
## 1 1872-11-30 Scotla… England       0       0 Friend… Glas… Scotla… FALSE   1872 
## 2 1873-03-08 England Scotla…       4       2 Friend… Lond… England FALSE   1873 
## 3 1874-03-07 Scotla… England       2       1 Friend… Glas… Scotla… FALSE   1874 
## 4 1875-03-06 England Scotla…       2       2 Friend… Lond… England FALSE   1875 
## 5 1876-03-04 Scotla… England       3       0 Friend… Glas… Scotla… FALSE   1876 
## 6 1876-03-25 Scotla… Wales         4       0 Friend… Glas… Scotla… FALSE   1876 
## # … with 5 more variables: month <chr>, dayofweek <chr>, outcome <chr>,
## #   winning_team <chr>, losing_team <chr>, and abbreviated variable names
## #   ¹​home_team, ²​away_team, ³​home_score, ⁴​away_score, ⁵​tournament
# Now, let's do some basic exploration. How many entries? Answer: more than 44k matches.
c(nrow(df), ncol(df))
## [1] 44353    15
# A journey through the historical landscape of international soccer
# Which teams play the most?
# Let's start by checking which teams are the most represented. This will tell us which teams have the richest history.
# Surprisingly, Sweden is the team that has played the most games. Most top 10 countries are major soccer nations such as Brazil, Argentina, England, Germany or France. Countries such as Uruguay, Mexico and Hungary are also old teams, as they participated in the first World Cups (1930 and/or 1934).

# Long format: one row per team appearance, home sides first, then away sides.
all_teams <- data.frame(
  teams = c(df$home_team, df$away_team),
  year = as.numeric(rep(df$year, times = 2))
)

# Total number of games per team, busiest teams first.
all_teams_count <- all_teams %>%
  group_by(teams) %>%
  summarise(number_games = n()) %>%
  arrange(desc(number_games))

head(all_teams_count, n = 10)
## # A tibble: 10 × 2
##    teams       number_games
##    <chr>              <int>
##  1 Sweden              1053
##  2 England             1049
##  3 Brazil              1021
##  4 Argentina           1018
##  5 Germany              986
##  6 Hungary              966
##  7 Mexico               935
##  8 Uruguay              919
##  9 South Korea          905
## 10 France               880
# It is likely that all these teams have had different trajectories: some might have started playing earlier and some later. The plot below displays the cumulative number of matches for these top 10 teams. Hover over a line to display the name of the team. You can also click on a team's name to hide/show it.

# Games per year (before 2018) for the ten teams with the most games overall.
# `year_date` is January 1st of the year, used as a date axis for plotting.
top_teams_games_per_year <- all_teams %>%
  filter(teams %in% head(all_teams_count$teams, 10), year < 2018) %>%
  group_by(teams, year) %>%
  summarise(nb_games = n()) %>%
  mutate(year_date = as.Date(paste0(year, "-01-01")))
## `summarise()` has grouped output by 'teams'. You can override using the
## `.groups` argument.
library(plotly)

# Running total of games for each team, accumulated in chronological order.
top_teams_games_per_year <- top_teams_games_per_year %>%
  arrange(teams, year) %>%
  group_by(teams) %>%
  mutate(cumsum = cumsum(nb_games))

# One line per team; ggplotly() turns the static plot into an interactive one.
p <- top_teams_games_per_year %>%
  ggplot(aes(x = year_date, y = cumsum, colour = teams, group = teams)) +
  geom_line() +
  labs(
    x = "Year",
    y = "Cumulated number of games",
    title = "Top 10 teams in total number of games",
    colour = "Click on a team \nto hide/show it"
  )
ggplotly(p)
# The 10 most active teams indeed have different trajectories. England gets its second position thanks to the many games it played in the 19th century. Some countries such as Sweden, France or Hungary have a steadier progression, while teams like Korea or Mexico join the top 10 thanks to their recent hyperactivity (Korea's first official games were just before 1950).
# How many games per year?
# Let's now check how many games were played each year and how the total number of international games evolve with time.
# Number of games per year, restricted to years before 2018.
# `year` is stored as text in `df`, so it is converted to a number before
# filtering; otherwise `year < 2018` would be a comparison of strings.
tmp <- df %>%
  mutate(year = as.numeric(year)) %>%
  filter(year < 2018) %>%
  group_by(year) %>%
  summarise(nb_games = n()) %>%
  ungroup()

ggplot(tmp, aes(x = year, y = nb_games, group = 1)) +
  geom_line() +
  labs(x="Year", title="Number of international soccer games", y="") +
  scale_x_continuous(breaks = seq(1870, 2020, 10))

# There are few interestings things going on here:
# * Number of games is rising, with high growth in the 80s/90s.
# * It seems there is a peak around 2010, with a slight decrease since.
# * We see a drop during world wars.
# * Since the 80s, data is very spiky, likely due to the absence/presence of world cups or other events.
#
# Let's try to visualise this to add some understanding to our plot.

# World Cup years covered by the plot: 1930, 1934, 1938, then every four
# years from 1950 to 2014.
wc_years <- c(1930, 1934, 1938, seq(1950, 2014, 4))

# Flag the years in which a World Cup took place.
tmp <- tmp %>%
  mutate(is_wc = year %in% wc_years)

# Same line plot as before, with World Cup years marked as points and thin
# vertical lines at 1914, 1918, 1939 and 1945 (start/end of the world wars).
# `linewidth` replaces `lwd`, which ggplot2 >= 3.4.0 reports as deprecated
# for lines.
ggplot(tmp, aes(x=year, y=nb_games, group=1)) +
  geom_line() +
  geom_point(data = tmp %>% filter(is_wc), aes(colour=is_wc)) +
  labs(x="Year", title="Number of international soccer games", y="", colour="World cup year") +
  geom_vline(xintercept=c(1914,1918,1939,1945), linewidth=0.3, colour="gray80") +
  scale_x_continuous(breaks=seq(1870, 2020, 10))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# The two main drops indeed correspond to the two world wars but, surprisingly, the World Cup years are those with fewer matches.
# Let's investigate which are the most common game types and competitions every year, since 2000.

# Number of games per tournament and year (the result stays grouped by tournament).
df_competitions <- df %>%
  group_by(tournament, year) %>%
  summarise(nb_games = n())
## `summarise()` has grouped output by 'tournament'. You can override using the
## `.groups` argument.
# Stacked bars of games per tournament for the years 2000 to 2017.
# `year` is text in `df_competitions`, so it is converted for the range test
# (the column itself is left untouched and still gives a discrete x axis).
# guides(fill = "none") hides the legend; `FALSE` is deprecated since
# ggplot2 3.3.4.
ggplot(df_competitions %>% filter(as.numeric(year) >= 2000, as.numeric(year) < 2018),
       aes(x=year, y=nb_games, fill=tournament)) +
  geom_bar(stat="identity") +
  guides(fill="none") +
  labs(x="Year", y="Number of games")
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# We can see that some events/tournaments are more frequent on non-world cup years such as 2007 or 2011. Let's check what they are.
# Tournaments of 2011 ranked by number of games (`year` is stored as text).
df_competitions %>%
  filter(year == "2011") %>%
  arrange(desc(nb_games))
## # A tibble: 22 × 3
## # Groups:   tournament [22]
##    tournament                           year  nb_games
##    <chr>                                <chr>    <int>
##  1 Friendly                             2011       379
##  2 FIFA World Cup qualification         2011       216
##  3 UEFA Euro qualification              2011       154
##  4 African Cup of Nations qualification 2011        77
##  5 AFC Asian Cup                        2011        32
##  6 AFC Challenge Cup qualification      2011        29
##  7 Island Games                         2011        29
##  8 Pacific Games                        2011        29
##  9 CECAFA Cup                           2011        26
## 10 Copa América                         2011        26
## # … with 12 more rows
# Tournaments of 2010 ranked by number of games (`year` is stored as text).
df_competitions %>%
  filter(year == "2010") %>%
  arrange(desc(nb_games))
## # A tibble: 21 × 3
## # Groups:   tournament [21]
##    tournament                           year  nb_games
##    <chr>                                <chr>    <int>
##  1 Friendly                             2010       423
##  2 UEFA Euro qualification              2010        94
##  3 FIFA World Cup                       2010        64
##  4 African Cup of Nations qualification 2010        48
##  5 CFU Caribbean Cup qualification      2010        34
##  6 African Cup of Nations               2010        29
##  7 AFF Championship                     2010        24
##  8 AFC Asian Cup qualification          2010        19
##  9 CECAFA Cup                           2010        18
## 10 CFU Caribbean Cup                    2010        16
## # … with 11 more rows
# World Cup qualifications generate many more matches than the World Cup itself, which makes sense as the World Cup only concerns 32 countries. This is well shown in the two plots below: there are no WC qualification matches during a World Cup year, and the number of qualification matches is greater than the number of WC matches by a factor of 3 to 7 in general.

# Keep five selected competitions for the years 2006 to 2017.
# `year` is text in `df_competitions`, so it is converted for the range test
# instead of being compared to numbers as a string.
df_competition_filtered <- df_competitions %>%
  filter(as.numeric(year) >= 2006,
         as.numeric(year) < 2018,
         tournament %in% c("Friendly","UEFA Euro qualification","FIFA World Cup", "FIFA World Cup qualification", "African Cup of Nations qualification"))

# Games per year for each selected competition, as lines...
ggplot(df_competition_filtered, aes(x=year, y=nb_games, group=tournament, colour=tournament)) +
  geom_point() +
  geom_line() +
  labs(x="Year", y="Nb games", colour="Competition")

# ...and as stacked bars.
ggplot(df_competition_filtered, aes(x=year, y=nb_games, group=tournament, fill=tournament)) +
  geom_bar(stat="identity") +
  labs(x="Year", y="Nb games", fill="Competition")

# Worldwide soccer adoption
# When did soccer start to be widely played, i.e. when do most nations start playing international games? The plot below teaches us several things:
# 
# * The number of teams steadily increased from 1902 and this increase accelerated up to 1920.
# * From there, the pace of addition of new teams increased much faster and stalled a bit around the late 40's.
# * Then we see a steady and rapid growth up to the mid 1990's.

# Year of the first recorded game of every team.
df_teams_start <- all_teams %>%
  mutate(year = as.numeric(year)) %>%
  group_by(teams) %>%
  summarise(first_game = min(year))

# Number of teams debuting each year, plus the running total of teams.
df_year_teams_start <- df_teams_start %>%
  group_by(first_game) %>%
  summarise(n = n()) %>%
  arrange(first_game) %>%
  mutate(cumsum = cumsum(n))

df_year_teams_start %>%
  ggplot(aes(x = first_game, y = cumsum)) +
  geom_line() +
  scale_x_continuous(breaks = seq(1870, 2020, 10)) +
  labs(x="Year", title="Cumulative sum of number of international soccer teams", y="")

# Which were the first and last teams to join?

# The first four teams to compete in international games were from what now forms the UK. Soccer then crossed the pond and teams such as Canada, USA, Argentina or Uruguay joined the party. At the same time, central European countries such as Austria and Hungary also joined the international arena.

# Amongst the late joiners we mostly find tiny countries (Vatican or Comoros) and recent ones (Kosovo or South Sudan). We also find Caribbean or North American islands which aren't countries but collectivities or municipalities of countries such as France or the Netherlands. Although they are not nations, they competed against other countries either in friendly games or in local tournaments.
# The ten teams with the earliest first game.
head(arrange(df_teams_start, first_game), n = 10)
## # A tibble: 10 × 2
##    teams            first_game
##    <chr>                 <dbl>
##  1 England                1872
##  2 Scotland               1872
##  3 Wales                  1876
##  4 Northern Ireland       1882
##  5 Canada                 1885
##  6 United States          1885
##  7 Argentina              1902
##  8 Austria                1902
##  9 Hungary                1902
## 10 Uruguay                1902
# The ten teams with the most recent first game.
tail(arrange(df_teams_start, first_game), n = 10)
## # A tibble: 10 × 2
##    teams             first_game
##    <chr>                  <dbl>
##  1 Surrey                  2018
##  2 Yorkshire               2018
##  3 Chameria                2019
##  4 Saint Helena            2019
##  5 Aymara                  2022
##  6 Biafra                  2022
##  7 Brunei Darussalam       2022
##  8 Mapuche                 2022
##  9 Maule Sur               2022
## 10 Yoruba Nation           2022
# We have seen how different teams and continent started to compete one after the others. Let's now see what did this imply for the game itself and its organisation.
# When do games occur?
# Interestingly, the very first games mostly occurred on Saturdays but a decent number also took place on Mondays! No game occurred on a Sunday until 1900, potentially for religious reasons, but around the 1910's Sunday was the most common day of the week to see an international game. Other week days, from Tuesday to Friday, weren't an option until later (as late as 1910 for Fridays).
# 
# The proportion of games happening on a given day then changed quite a lot. Wednesday games became very common, and around 30% of the games happened on this day around the year 2000. More recently, days such as Tuesday, Thursday or Friday also became more popular.
# Share (in %) of each year's games played on each day of the week, for years
# before 2018. The weekday becomes a factor ordered from Monday to Sunday.
df_games_per_dayofweek <- df %>%
  mutate(year = as.numeric(year)) %>%
  filter(year < 2018) %>%
  group_by(year, dayofweek) %>%
  summarise(n = n()) %>%
  group_by(year) %>%
  mutate(
    perc = n / sum(n) * 100,
    dayofweek = factor(
      dayofweek,
      levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
    )
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# One panel per day of the week showing its yearly share of games.
# The colour legend duplicates the panel titles, so it is hidden;
# "none" replaces `FALSE`, which is deprecated since ggplot2 3.3.4.
ggplot(df_games_per_dayofweek, aes(x=year, y=perc, colour=dayofweek, group=dayofweek)) +
  geom_line() +
  facet_wrap(~dayofweek) +
  labs(x="Year", y="Percentage of games played") +
  guides(colour="none") +
  scale_x_continuous(breaks = seq(1870, 2020, 20)) +
  scale_y_continuous(breaks = seq(0,100, 10)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Now that we have looked at days, let's check whether some months are more popular for soccer games. The first games mostly occurred during spring months and, since then, some months have known peaks of popularity for international games at different periods (e.g. many games happened in December in the 1940s).
# In a more recent history, international games became less common in May but more in June.
# Share (in %) of each year's games played in each month, for years before
# 2018. The month becomes a factor in calendar order ("Jan" ... "Dec").
df_games_per_month <- df %>%
  mutate(year = as.numeric(year)) %>%
  filter(year < 2018) %>%
  group_by(year, month) %>%
  summarise(n = n()) %>%
  group_by(year) %>%
  mutate(
    perc = n / sum(n) * 100,
    month = factor(month, levels = month.abb)
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# One panel per month showing its yearly share of games.
# The colour legend duplicates the panel titles, so it is hidden;
# "none" replaces `FALSE`, which is deprecated since ggplot2 3.3.4.
ggplot(df_games_per_month, aes(x=year, y=perc, colour=month, group=month)) +
  geom_line() +
  facet_wrap(~month) +
  labs(x="Year", y="Percentage of games played") +
  guides(colour="none") +
  scale_x_continuous(breaks = seq(1870, 2020, 20)) +
  scale_y_continuous(breaks = seq(0,100, 10)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Evolution of results
 
# Let's now talk about sport and actual results! First let's check how the proportion of draws and home/away victories evolved through time. Main learnings are:
# * A victory of the home-based team has always been the most likely event.
# * A victory of the visitors is the second most likely outcome, although it tends to decrease in the second half of the 20th century.
# * A draw has always been the least likely outcome, although it has increased in share since the 1940's.
 
# It is to be noted that the "home" team isn't always playing on his own country, as for example during world or continental cups.
# For every year: number of games per outcome (H/A/D), the yearly total and
# each outcome's share of that total in percent.
df_outcome_per_year <- df %>%
  mutate(year = as.numeric(year)) %>%
  group_by(year, outcome) %>%
  summarise(n = n()) %>%
  group_by(year) %>%
  mutate(total_year = sum(n)) %>%
  mutate(perc = n / total_year * 100)
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Yearly share of each outcome between 1901 and 2017, with a loess trend line.
df_outcome_per_year %>%
  filter(year > 1900, year < 2018) %>%
  ggplot(aes(x = year, y = perc, group = outcome, colour = outcome)) +
  geom_line() +
  labs(x="Year", y="Percentage of games", colour="Outcome") +
  geom_smooth(se = FALSE, method = "loess") +
  scale_x_continuous(breaks = seq(1870, 2020, 20))
## `geom_smooth()` using formula = 'y ~ x'

# Let's now get to what is at the heart of soccer: goals! How did this evolve with time?

# Although it started low (the first game resulted in a 0-0 between Scotland and England), the number of goals per game quickly skyrocketed and, before 1900, the average number of goals per game per year could be as high as 8!
# This average then stabilized around 4 until 1950 and then decreased to 2.5 in the more modern era. The 80's was the period where games delivered the lowest number of goals.
# Per year: number of games, total goals (both sides) and goals per game.
df_goals_per_game <- df %>%
  mutate(year = as.numeric(year)) %>%
  group_by(year) %>%
  summarise(
    nb_games = n(),
    nb_goals = sum(home_score + away_score),
    goals_per_game = nb_goals / nb_games
  )

df_goals_per_game %>%
  ggplot(aes(x = year, y = goals_per_game)) +
  geom_line() +
  labs(x="Year", y="", title="Average number of goals per game") +
  scale_x_continuous(breaks = seq(1870, 2020, 10))

#Review 2
# # Best performing teams during soccer history
# 
# ## Which teams were consistent high scorer and good defender across time?
# 
# We have seen how games became globally less prolific in goals, but what about teams? Did some teams always scored a lot or, at contrary, were some always great defenders? 
# First, let's transform a bit the data for this purpose. We will now have two entries per game, one from the perspective of each team.
# Describe one game from the home team's point of view.
#
# Args:
#   v: named vector holding one row of `df` (as handed over by
#      apply(df, 1, ...)); the elements "date", "year", "tournament",
#      "home_team", "away_team", "home_score" and "away_score" are read.
# Returns:
#   Vector of 9 elements: date, year, tournament, team, opponent, goals for,
#   goals against, outcome for the team ("W", "D" or "L") and "H".
games_info_home <- function(v) {
  team1 <- v["home_team"]
  team1_gf <- v["home_score"]
  team1_ga <- v["away_score"]
  # apply() passes each row as character strings, so the scores are converted
  # before comparing: as text, "10" > "9" is FALSE.
  goal_diff <- as.numeric(team1_gf) - as.numeric(team1_ga)
  team1_outcome <- "D"
  if (goal_diff > 0) {
    team1_outcome <- "W"
  } else if (goal_diff < 0) {
    team1_outcome <- "L"
  }

  c(v["date"], v["year"], v["tournament"], team1, v["away_team"],
    team1_gf, team1_ga, team1_outcome, "H")
}
# Describe one game from the away team's point of view.
#
# Args:
#   v: named vector holding one row of `df` (as handed over by
#      apply(df, 1, ...)); the elements "date", "year", "tournament",
#      "home_team", "away_team", "home_score" and "away_score" are read.
# Returns:
#   Vector of 9 elements: date, year, tournament, team, opponent, goals for,
#   goals against, outcome for the team ("W", "D" or "L") and "A".
games_info_away <- function(v) {
  team2 <- v["away_team"]
  team2_gf <- v["away_score"]
  team2_ga <- v["home_score"]
  # apply() passes each row as character strings, so the scores are converted
  # before comparing: as text, "9" > "10" is TRUE.
  goal_diff <- as.numeric(team2_gf) - as.numeric(team2_ga)
  team2_outcome <- "D"
  if (goal_diff > 0) {
    team2_outcome <- "W"
  } else if (goal_diff < 0) {
    team2_outcome <- "L"
  }

  c(v["date"], v["year"], v["tournament"], team2, v["home_team"],
    team2_gf, team2_ga, team2_outcome, "A")
}
# Build the per-team view of the games: every row of `df` yields two rows,
# one seen from the home side and one from the away side.
df_teams_games_home <- t(apply(df, MARGIN = 1, FUN = games_info_home))
df_teams_games_away <- t(apply(df, MARGIN = 1, FUN = games_info_away))
df_teams_games <- rbind(df_teams_games_home, df_teams_games_away)

colnames(df_teams_games) <- c("date", "year", "tournament", "team", "opponent", "team_score", "opponent_score", "team_outcome", "where")

# Turn the matrix into a data frame, restore the Date and numeric columns,
# and sort the games chronologically.
df_teams_games <- df_teams_games %>%
  as.data.frame() %>%
  mutate(
    date = as.Date(date),
    year = as.numeric(as.character(year)),
    team_score = as.numeric(as.character(team_score)),
    opponent_score = as.numeric(as.character(opponent_score))
  ) %>%
  arrange(date)

head(df_teams_games, n = 10)
##          date year tournament     team opponent team_score opponent_score
## 1  1872-11-30 1872   Friendly Scotland  England          0              0
## 2  1872-11-30 1872   Friendly  England Scotland          0              0
## 3  1873-03-08 1873   Friendly  England Scotland          4              2
## 4  1873-03-08 1873   Friendly Scotland  England          2              4
## 5  1874-03-07 1874   Friendly Scotland  England          2              1
## 6  1874-03-07 1874   Friendly  England Scotland          1              2
## 7  1875-03-06 1875   Friendly  England Scotland          2              2
## 8  1875-03-06 1875   Friendly Scotland  England          2              2
## 9  1876-03-04 1876   Friendly Scotland  England          3              0
## 10 1876-03-04 1876   Friendly  England Scotland          0              3
##    team_outcome where
## 1             D     H
## 2             D     A
## 3             W     H
## 4             L     A
## 5             W     H
## 6             L     A
## 7             D     H
## 8             D     A
## 9             W     H
## 10            L     A
# Per team and year: goals scored per game, goals conceded per game and
# number of games played.
df_teams_goals_per_year <- df_teams_games %>%
  group_by(team, year) %>%
  summarise(
    gf_per_game = sum(team_score) / n(),
    ga_per_game = sum(opponent_score) / n(),
    total_games = n()
  )
## `summarise()` has grouped output by 'team'. You can override using the
## `.groups` argument.
head(df_teams_goals_per_year, n = 10)
## # A tibble: 10 × 5
## # Groups:   team [2]
##    team         year gf_per_game ga_per_game total_games
##    <chr>       <dbl>       <dbl>       <dbl>       <int>
##  1 Abkhazia     2012       0.5         2               2
##  2 Abkhazia     2014       1.2         1.2             5
##  3 Abkhazia     2016       3           0.2             5
##  4 Abkhazia     2017       1           1.2             5
##  5 Abkhazia     2018       2.5         0.667           6
##  6 Abkhazia     2019       1.2         0.6             5
##  7 Afghanistan  1941       1           3               1
##  8 Afghanistan  1950       0           4               1
##  9 Afghanistan  1975       0.5         3               6
## 10 Afghanistan  1976       0.667       0.667           3
# When filtering out teams with less than 25 games, the names of the most prolific teams overall might surprise you. The top 4 greatest scorers are small Oceanian teams such as New Caledonia, Tahiti, Papua New Guinea or Fiji. This is likely explained by these teams mostly competing against other "local" teams in more open games. Amongst the most "conventional" soccer nations, Germany, England and Brazil make it to the podium with, respectively, 2.25, 2.19 and 2.19 goals per game on average during their history.
# Per team over all years: goals scored per game, goals conceded per game
# and number of games played.
df_teams_goals_overall <- df_teams_games %>%
  group_by(team) %>%
  summarise(
    gf_per_game = sum(team_score) / n(),
    ga_per_game = sum(opponent_score) / n(),
    total_games = n()
  )

# Ten highest-scoring teams among those with more than 25 games.
top10_attack <- df_teams_goals_overall %>%
  filter(total_games > 25) %>%
  arrange(desc(gf_per_game)) %>%
  head(10) %>%
  select(team, gf_per_game, total_games)
top10_attack
## # A tibble: 10 × 3
##    team             gf_per_game total_games
##    <chr>                  <dbl>       <int>
##  1 Sápmi                   3.23          26
##  2 Isle of Man             3.18          49
##  3 Northern Cyprus         2.88          34
##  4 Padania                 2.74          43
##  5 Gotland                 2.7           30
##  6 Basque Country          2.69          58
##  7 Tahiti                  2.66         213
##  8 New Caledonia           2.64         237
##  9 Isle of Wight           2.41          44
## 10 Papua New Guinea        2.34         122
# The top defenses also offer some surprises. Iran and Morocco have the best defenses with 0.82 and 0.85 goals in average during around 500 games! Spain and Brazil make it to the top 5. Italy, the mother nation of [the Catenaccio](https://en.wikipedia.org/wiki/Catenaccio) closes the top 10.
# Ten teams conceding the fewest goals per game among those with more than 25 games.
top10_defense <- df_teams_goals_overall %>%
  filter(total_games > 25) %>%
  arrange(ga_per_game) %>%
  head(10) %>%
  select(team, ga_per_game, total_games)
top10_defense
## # A tibble: 10 × 3
##    team            ga_per_game total_games
##    <chr>                 <dbl>       <int>
##  1 Padania               0.767          43
##  2 Iran                  0.798         525
##  3 Morocco               0.838         580
##  4 Abkhazia              0.857          28
##  5 Spain                 0.892         733
##  6 Brazil                0.894        1021
##  7 South Korea           0.902         905
##  8 Northern Cyprus       0.912          34
##  9 Iraq                  0.948         577
## 10 Italy                 0.961         838
# If we look at what happened since 1980 only, the picture only changes slightly.
# Top scorer teams are still from Oceania. 
# Amongst the best defenses, 6 of the top 10 teams are now from Europe (including all teams from the top 3) and the number of goals against per game has dropped to between 0.73 and 0.88. Brazil, considered a very offensive team, still makes it to the top 10.
# Ten highest-scoring teams when only games played after 1980 are counted
# (teams with more than 25 such games).
df_teams_games %>%
  filter(year > 1980) %>%
  group_by(team) %>%
  summarise(
    gf_per_game = sum(team_score) / n(),
    ga_per_game = sum(opponent_score) / n(),
    total_games = n()
  ) %>%
  filter(total_games > 25) %>%
  arrange(desc(gf_per_game)) %>%
  select(team, gf_per_game, total_games) %>%
  head(10)
## # A tibble: 10 × 3
##    team            gf_per_game total_games
##    <chr>                 <dbl>       <int>
##  1 Sápmi                  3.23          26
##  2 Isle of Man            3.18          49
##  3 Northern Cyprus        3.16          31
##  4 Padania                2.74          43
##  5 Gotland                2.7           30
##  6 New Caledonia          2.60         152
##  7 Isle of Wight          2.41          44
##  8 Basque Country         2.39          31
##  9 Tahiti                 2.38         141
## 10 Fiji                   2.26         186
# Ten teams conceding the fewest goals per game when only games played after
# 1980 are counted (teams with more than 25 such games).
df_teams_games %>%
  filter(year > 1980) %>%
  group_by(team) %>%
  summarise(
    gf_per_game = sum(team_score) / n(),
    ga_per_game = sum(opponent_score) / n(),
    total_games = n()
  ) %>%
  filter(total_games > 25) %>%
  arrange(ga_per_game) %>%
  select(team, ga_per_game, total_games) %>%
  head(10)
## # A tibble: 10 × 3
##    team            ga_per_game total_games
##    <chr>                 <dbl>       <int>
##  1 Brazil                0.723         599
##  2 England               0.734         492
##  3 Northern Cyprus       0.742          31
##  4 Morocco               0.748         441
##  5 Spain                 0.759         490
##  6 Iran                  0.760         438
##  7 Padania               0.767          43
##  8 France                0.770         492
##  9 Italy                 0.781         480
## 10 Netherlands           0.829         451
# Let's look at how the defense and offense skills of these teams have evolved through time.
# 
# Some of the best scoring teams are on a declining trend, such as Hungary, Tahiti or Papua New Guinea. However, other teams such as Germany, Brazil or Fiji are very stable, which is remarkable since, as seen before, the overall number of goals per game is decreasing.
# 
# The best defending teams are following the global trend of games delivering fewer goals and are generally conceding fewer goals too.
# Yearly goals scored per game for the ten best attacks, one panel per team,
# with a linear trend line.
# The colour legend duplicates the panel titles, so it is hidden;
# "none" replaces `FALSE`, which is deprecated since ggplot2 3.3.4.
ggplot(top10_attack %>% select(team) %>% left_join(df_teams_goals_per_year, by="team"),
       aes(x=year, y=gf_per_game, colour=team)) +
  geom_line() +
  facet_wrap(~team) +
  labs(x="Year", y="Goal scored per game") +
  guides(colour="none") +
  geom_smooth(method="lm")
## Warning in left_join(., df_teams_goals_per_year, by = "team"): Each row in `x` is expected to match at most 1 row in `y`.
## ℹ Row 1 of `x` matches multiple rows.
## ℹ If multiple matches are expected, set `multiple = "all"` to silence this
##   warning.
## `geom_smooth()` using formula = 'y ~ x'

# Yearly goals conceded per game for the ten best defenses, one panel per
# team, with a linear trend line.
# The colour legend duplicates the panel titles, so it is hidden;
# "none" replaces `FALSE`, which is deprecated since ggplot2 3.3.4.
ggplot(top10_defense %>% select(team) %>% left_join(df_teams_goals_per_year, by="team"),
       aes(x=year, y=ga_per_game, colour=team)) +
  geom_line() +
  facet_wrap(~team) +
  labs(x="Year", y="Goal against per game") +
  guides(colour="none") +
  geom_smooth(method="lm")
## Warning in left_join(., df_teams_goals_per_year, by = "team"): Each row in `x` is expected to match at most 1 row in `y`.
## ℹ Row 1 of `x` matches multiple rows.
## ℹ If multiple matches are expected, set `multiple = "all"` to silence this
##   warning.
## `geom_smooth()` using formula = 'y ~ x'

# ## Defense and attack per decade
# 
# We have looked at defense and attack overall but it is very likely that the best defending and attacking countries haven't always been the same. So let's break this down by decade.
# Per team and decade: goals scored per game, goals conceded per game and
# number of games, plus a printable decade label such as "1870's".
#
# The decade is derived from the year itself (1872 -> 1870) rather than from
# the earliest year observed in the data, which labelled the first decade
# "1872's". The cut() breaks are derived from the data as well, so that years
# from 2020 onwards fall into a proper interval instead of an NA decade.
# Columns are the same as before: team, decade, gf_per_game, ga_per_game,
# total_games, min_year (first year of the decade) and decade_year.
df_teams_goals_per_decade <- df_teams_games %>%
  mutate(decade_start = (year %/% 10) * 10,
         decade = cut(year,
                      seq(min(decade_start, na.rm = TRUE),
                          max(decade_start, na.rm = TRUE) + 10,
                          10),
                      dig.lab = 4, right = FALSE)) %>%
  group_by(team, decade) %>%
  summarise(gf_per_game = sum(team_score) / length(date),
            ga_per_game = sum(opponent_score) / length(date),
            total_games = length(date),
            min_year = min(decade_start)) %>%
  ungroup() %>%
  group_by(decade) %>%
  mutate(decade_year = paste0(min_year, "'s"))
## `summarise()` has grouped output by 'team'. You can override using the
## `.groups` argument.
#group_by(decade) %>%
#top_n(n=6, wt=winrate) %>%
#ungroup() %>%
#arrange(desc(decade), desc(winrate)) %>%
#mutate(ord = rev(row_number())) %>%
#mutate(decade_year = paste(min_year, "'s", sep=""))
# Six highest-scoring teams (goals for per game) of each decade, restricted to
# teams with more than 10 games in that decade. `ord` is a global plotting
# rank: rows are sorted by decade label (descending) then by goals scored
# (descending), and numbered in reverse.
df_teams_goals_per_decade_top_gf <- df_teams_goals_per_decade %>%
  filter(total_games > 10) %>%
  group_by(decade_year) %>%
  top_n(n = 6, wt = gf_per_game) %>%
  ungroup() %>%
  arrange(desc(decade_year), desc(gf_per_game)) %>%
  mutate(ord = rev(seq_len(n())))
# Best scoring teams have changed quite a lot through the different decades in soccer history. Some of the lessons we can learn are:
# 
# * Scotland once, was one of the top scoring nations (OK, that was when max 10 teams were competing, but still) and slowly dropped from the top 6.
# * Sweden was consistently in the top 6 for 4 decades in a row (1910s to 1940s).
# * Fiji and Tahiti were at the top of the charts during some decades too, including some recent ones.
# * Zambia and China once were among the top scorers.
# * During the last 3 decades, Germany and Spain are the only major nations who made it twice to the top 6.
# Horizontal bar chart of the top-scoring teams, one panel per decade.
# Bars are positioned with the numeric `ord` rank and then relabelled with the
# team names, so each panel keeps its own ordering.
ggplot(df_teams_goals_per_decade_top_gf, aes(x = ord, y = gf_per_game, fill = team)) +
  geom_bar(stat = "identity") +
  facet_wrap(~decade_year, scales = "free_y") +
  coord_flip() +
  scale_x_continuous(labels = df_teams_goals_per_decade_top_gf$team,
                     breaks = df_teams_goals_per_decade_top_gf$ord) +
  labs(x = "", y = "Goals scored per game") +
  # Team names are on the axis, so the fill legend is hidden.
  # `guides(fill = FALSE)` is deprecated since ggplot2 3.3.4; use "none".
  guides(fill = "none")

# Let's look at defenses now. Here is what we can see:
# * Scotland also used to have a good defense.
# * England and Germany were solid during the 1930's and 1940's.
# * China and Tahiti were amongst the best defenses between the 1960's and 1980's.
# * Despite of being seen as an offensive team, Brazil was #1 and #3 best defense in the 1980's and 1990's.
# * Germany was the second best defense two decades in a row (2000's and 2010's)
# Six best defenses of each decade, i.e. the teams with the FEWEST goals
# conceded per game, restricted to teams with more than 10 games in that
# decade.
# Fix: the selection previously used `top_n(n = 6, wt = gf_per_game)`, which
# picked the six best attacks and merely sorted them by goals against.
# `ord` is a global plotting rank: rows are sorted by decade label
# (descending) then by goals against (ascending), and numbered in reverse so
# that the best defense gets the highest rank within its decade.
df_teams_goals_per_decade_top_ga <- df_teams_goals_per_decade %>%
  group_by(decade_year) %>%
  filter(total_games > 10) %>%
  # A negative `n` makes top_n() keep the rows with the smallest `wt`.
  top_n(n = -6, wt = ga_per_game) %>%
  ungroup() %>%
  arrange(desc(decade_year), ga_per_game) %>%
  mutate(ord = rev(row_number()))

# Horizontal bar chart of goals conceded per game for the teams in
# `df_teams_goals_per_decade_top_ga`, one panel per decade.
# Bars are positioned with the numeric `ord` rank and then relabelled with the
# team names, so each panel keeps its own ordering.
ggplot(df_teams_goals_per_decade_top_ga, aes(x = ord, y = ga_per_game, fill = team)) +
  geom_bar(stat = "identity") +
  facet_wrap(~decade_year, scales = "free_y") +
  coord_flip() +
  scale_x_continuous(labels = df_teams_goals_per_decade_top_ga$team,
                     breaks = df_teams_goals_per_decade_top_ga$ord) +
  labs(x = "", y = "Goal against per game") +
  # Team names are on the axis, so the fill legend is hidden.
  # `guides(fill = FALSE)` is deprecated since ggplot2 3.3.4; use "none".
  guides(fill = "none")

# Are defense and attack correlated, i.e. are the top scorers also the best defense?
# Below, we can see that the teams scoring very few goals per game are also more likely to have a poorer defense. However, past a given limit of around 1.5 goals scored per game, the quality of the defense remains rather constant.
# In general, teams above the line generally have a bad defense given their attack level and teams below the line have a better defense given their attack stats.
# Attack vs. defense: one point per team and decade, coloured by decade, with
# a single smoother fitted over all points.
# The colour mapping lives on the point layer only. Mapping it at plot level
# made geom_smooth() inherit it and then drop it ("aesthetics were dropped
# during statistical transformation: colour") because one smoother spans
# several decades.
ggplot(df_teams_goals_per_decade, aes(x = gf_per_game, y = ga_per_game)) +
  geom_point(aes(colour = decade_year)) +
  geom_smooth(aes(group = 1)) +
  labs(x = "Goals for", y = "Goals against", colour = "Decade")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

# Looking at decades separately, we can see that this trend holds true most of the time, with some variations. For example, the relation was flatter in the 70's but is more pronounced in the most recent decades.
# Attack vs. defense, one panel per decade, each with its own loess smoother.
# Only decades whose first recorded year is after 1900 are shown.
ggplot(df_teams_goals_per_decade %>% filter(min_year > 1900),
       aes(x = gf_per_game, y = ga_per_game, colour = decade_year)) +
  geom_point(size = 0.5) +
  facet_wrap(~decade_year, scales = "free") +
  geom_smooth(aes(group = 1), method = "loess") +
  labs(x = "Goals for", y = "Goals against") +
  # The facet strips already name the decade, so the colour legend is hidden.
  # `guides(colour = FALSE)` is deprecated since ggplot2 3.3.4; use "none".
  guides(colour = "none")
## `geom_smooth()` using formula = 'y ~ x'

# ## Overall, which team has the best win ratio?
# 
# Now that we have looked at attack and defense, let's move to what finally matters the most: winning. It can be seen as fair to say that the most dominating team is the one that wins the highest number of games. Let's then compute the win ratio of all teams.
# Number of games per year per team
# Games played per team and per year (years before 2018 only), plus the year
# expressed as a Date set to January 1st.
df_team_games_per_year <- all_teams %>%
  filter(year < 2018) %>%
  group_by(teams, year) %>%
  summarise(nb_games = n()) %>%
  mutate(year_date = as.Date(paste0(year, "-01-01")))
## `summarise()` has grouped output by 'teams'. You can override using the
## `.groups` argument.
# Number of victories per year
# Count of won games per year and per winning team. Games without a
# `winning_team` (NA) are ignored.
df_nb_victories <- df %>%
  filter(!is.na(winning_team)) %>%
  transmute(year = as.numeric(year), winning_team) %>%
  group_by(year, winning_team) %>%
  summarise(nb_victories = n())
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Number of losses per year
# Count of lost games per year and per losing team. Games without a
# `losing_team` (NA) are ignored.
df_nb_losses <- df %>%
  filter(!is.na(losing_team)) %>%
  transmute(year = as.numeric(year), losing_team) %>%
  group_by(year, losing_team) %>%
  summarise(nb_losses = n())
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Putting all this together
# Yearly record of every team: games played, wins, losses and ties.
# A team/year with no matching row in the victory (or loss) table gets NA from
# the join, which is turned into 0; ties are whatever remains of the games.
df_teams_winrate <- df_team_games_per_year %>%
  left_join(df_nb_victories, by = c("year", "teams" = "winning_team")) %>%
  left_join(df_nb_losses, by = c("year", "teams" = "losing_team")) %>%
  mutate(nb_victories = ifelse(is.na(nb_victories), 0, nb_victories),
         nb_losses = ifelse(is.na(nb_losses), 0, nb_losses),
         nb_ties = nb_games - (nb_victories + nb_losses))
# Let's look overall
# All-time record per team: yearly counts summed over all years, then
# converted into win / loss / tie percentages of the games played.
df_teams_winrate_overall <- df_teams_winrate %>%
  group_by(teams) %>%
  summarise(across(c(nb_games, nb_victories, nb_losses, nb_ties), sum)) %>%
  mutate(winrate = nb_victories / nb_games * 100,
         lossrate = nb_losses / nb_games * 100,
         tierate = nb_ties / nb_games * 100)
# We will remove teams who played less than 10 games in total as they might have rather random win ratios (otherwise, the top 2 teams have a 100% win rate and... 1 game only).
# This time, the top teams are not a surprise: Brazil, Germany and Spain. Some teams are more surprising such as Jersey or Northern Cyprus. Together with Brazil, Argentina and Iran are the only non-European countries in this top 10. Czech Republic and Croatia also make it to this top 10.

# Ten teams with the highest overall win rate among those with more than
# 10 games played.
df_teams_winrate_overall %>%
  filter(nb_games > 10) %>%
  arrange(desc(winrate)) %>%
  slice_head(n = 10)
## # A tibble: 10 × 8
##    teams           nb_games nb_victories nb_lo…¹ nb_ties winrate lossr…² tierate
##    <chr>              <int>        <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 Padania               32           23       3       6    71.9    9.38   18.8 
##  2 Jersey                78           52      15      11    66.7   19.2    14.1 
##  3 Basque Country        55           36       9      10    65.5   16.4    18.2 
##  4 Brazil               957          607     156     194    63.4   16.3    20.3 
##  5 Andalusia             13            8       1       4    61.5    7.69   30.8 
##  6 Rhodes                18           11       5       2    61.1   27.8    11.1 
##  7 Northern Cyprus       28           17       7       4    60.7   25      14.3 
##  8 Germany              927          545     193     189    58.8   20.8    20.4 
##  9 Spain                670          391     128     151    58.4   19.1    22.5 
## 10 Isle of Man           45           26      15       4    57.8   33.3     8.89
## # … with abbreviated variable names ¹​nb_losses, ²​lossrate
# Long-format table (team, rate name, value) of the win / loss / tie rates of
# the ten teams with the highest win rate (more than 10 games played).
# `teams` becomes a factor whose levels follow increasing win rate, which
# fixes the bar order in the plot.
df_teams_winrate_overall_mold <- df_teams_winrate_overall %>%
  filter(nb_games > 10) %>%
  arrange(desc(winrate)) %>%
  transmute(teams = factor(teams, levels = teams[order(winrate)]),
            winrate,
            lossrate,
            tierate) %>%
  slice_head(n = 10) %>%
  melt(id.vars = "teams")

# Stacked horizontal bars: one bar per team, split into win / loss / tie
# percentages.
ggplot(df_teams_winrate_overall_mold,
       aes(x = teams, y = value, fill = variable, group = teams)) +
  geom_col() +
  coord_flip() +
  labs(x = "", y = "Percentage", fill = "",
       title = "Top 10 teams by overall win rate")

# Now that we looked at the best teams, which are the ones with the lowest win ratio? Nations with less than 10 games played are filtered out.
# 
# Without great surprise, they are mostly small nations. Kiribati is the only one of those nations that never won a game; their only draw is from 1979 and, ironically, it is not listed on their [Wikipedia page](https://en.wikipedia.org/wiki/Kiribati_national_football_team) (but their 24-0 defeat to Fiji is).
# 

# Ten teams with the lowest overall win rate among those with more than
# 10 games played.
df_teams_winrate_overall %>%
  filter(nb_games > 10) %>%
  arrange(winrate) %>%
  slice_head(n = 10)
## # A tibble: 10 × 8
##    teams          nb_games nb_victories nb_los…¹ nb_ties winrate lossr…² tierate
##    <chr>             <int>        <dbl>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 Kiribati             11            0       10       1   0        90.9    9.09
##  2 San Marino          147            1      142       4   0.680    96.6    2.72
##  3 Andorra             145            4      129      12   2.76     89.0    8.28
##  4 Djibouti             83            3       76       4   3.61     91.6    4.82
##  5 Anguilla             51            3       45       3   5.88     88.2    5.88
##  6 Luxembourg          381           28      306      47   7.35     80.3   12.3 
##  7 Liechtenstein       180           14      144      22   7.78     80     12.2 
##  8 Timor-Leste          51            4       44       3   7.84     86.3    5.88
##  9 Somalia             104            9       83      12   8.65     79.8   11.5 
## 10 American Samoa       43            4       38       1   9.30     88.4    2.33
## # … with abbreviated variable names ¹​nb_losses, ²​lossrate
# Every game in which Kiribati took part, either as home or as away team.
df %>%
  filter(home_team == "Kiribati" | away_team == "Kiribati")
## # A tibble: 11 × 15
##    date       home_team    away_…¹ home_…² away_…³ tourn…⁴ city  country neutral
##    <date>     <chr>        <chr>     <dbl>   <dbl> <chr>   <chr> <chr>   <lgl>  
##  1 1979-08-30 Fiji         Kiriba…      24       0 South … Naus… Fiji    FALSE  
##  2 1979-08-31 Kiribati     Papua …       0      13 South … Suva  Fiji    TRUE   
##  3 1979-09-05 Kiribati     Tuvalu        3       3 South … Naus… Fiji    TRUE   
##  4 2003-06-30 Tuvalu       Kiriba…       3       2 South … Suva  Fiji    TRUE   
##  5 2003-07-03 Solomon Isl… Kiriba…       7       0 South … Suva  Fiji    TRUE   
##  6 2003-07-05 Fiji         Kiriba…      12       0 South … Naus… Fiji    FALSE  
##  7 2003-07-07 Kiribati     Vanuatu       0      18 South … Laut… Fiji    TRUE   
##  8 2011-08-30 Fiji         Kiriba…       9       0 Pacifi… Boul… New Ca… TRUE   
##  9 2011-09-01 Cook Islands Kiriba…       3       0 Pacifi… Boul… New Ca… TRUE   
## 10 2011-09-03 Kiribati     Papua …       1      17 Pacifi… Boul… New Ca… TRUE   
## 11 2011-09-05 Kiribati     Tahiti        1      17 Pacifi… Boul… New Ca… TRUE   
## # … with 6 more variables: year <chr>, month <chr>, dayofweek <chr>,
## #   outcome <chr>, winning_team <chr>, losing_team <chr>, and abbreviated
## #   variable names ¹​away_team, ²​home_score, ³​away_score, ⁴​tournament
# Now that we have seen top and struggling team, let's check if some teams are best at draw games. Here again, only nations with more than 10 games were considered.
# 
# Interestingly, 8 out of the top 10 teams in term of ties are  from Africa. The numbers aren't extreme though as Angola, the top team in this ranking, has a nearly 35% tie rate, which is not far from what would be the expectation if the outcome of a game was purely random.

# Ten teams with the highest overall tie rate among those with more than
# 10 games played.
df_teams_winrate_overall %>%
  filter(nb_games > 10) %>%
  arrange(desc(tierate)) %>%
  slice_head(n = 10)
## # A tibble: 10 × 8
##    teams           nb_games nb_victories nb_lo…¹ nb_ties winrate lossr…² tierate
##    <chr>              <int>        <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 Abkhazia              17            6       3       8    35.3   17.6     47.1
##  2 Angola               345          117     108     120    33.9   31.3     34.8
##  3 Lesotho              235           42     120      73    17.9   51.1     31.1
##  4 Andalusia             13            8       1       4    61.5    7.69    30.8
##  5 Ellan Vannin          13            5       4       4    38.5   30.8     30.8
##  6 Cameroon             527          231     135     161    43.8   25.6     30.6
##  7 Botswana             259           67     115      77    25.9   44.4     29.7
##  8 Iraqi Kurdistan       27           13       6       8    48.1   22.2     29.6
##  9 Jordan               368          127     133     108    34.5   36.1     29.3
## 10 Lebanon              253           69     110      74    27.3   43.5     29.2
## # … with abbreviated variable names ¹​nb_losses, ²​lossrate
# ## Which are the best teams per decade
# 
# That's the question we all want to see answered! It came to no surprise that Brazil or Germany have the highest win ratios, but was it always the case? Which teams dominated the different eras of football.

# Record of every team per decade: yearly counts summed within left-closed
# 10-year bins, first year the team appears in that decade, and the resulting
# win / loss / tie percentages.
df_teams_winrate_per_decade <- df_teams_winrate %>%
  mutate(decade = cut(year,
                      breaks = seq(from = 1870, to = 2020, by = 10),
                      right = FALSE,
                      dig.lab = 4)) %>%
  group_by(teams, decade) %>%
  summarise(across(c(nb_games, nb_victories, nb_losses, nb_ties), sum),
            min_year = min(year)) %>%
  ungroup() %>%
  mutate(winrate = nb_victories / nb_games * 100,
         lossrate = nb_losses / nb_games * 100,
         tierate = nb_ties / nb_games * 100)
## `summarise()` has grouped output by 'teams'. You can override using the
## `.groups` argument.
# Six teams with the best win rate in each decade, among teams with more than
# 10 games in that decade. `min_year` is replaced by the earliest year of the
# decade across the remaining teams; `ord` is a global plotting rank (rows
# sorted by decade then win rate, both descending, numbered in reverse).
df_teams_winrate_per_decade_cleaned <- df_teams_winrate_per_decade %>%
  filter(nb_games > 10) %>%
  group_by(decade) %>%
  mutate(min_year = min(min_year)) %>%
  top_n(n = 6, wt = winrate) %>%
  ungroup() %>%
  arrange(desc(decade), desc(winrate)) %>%
  mutate(ord = rev(seq_len(n())))
# Here are some interesting findings we can collect from the plot below:
# 
# * England used to be one of the top teams. There was little competition at the time, but they were almost always in the top 6 teams until the 1960's and have never been back in this elite club since then.
# * Egypt and Iran are some of the teams that made it twice to the top 6 since the 1970's.
# * More or less expectedly, Argentina and Korea also are some of the teams regularly present in the top 5.
# * Brazil has consistently been in the top 5 for the last 8 decades, that's the most striking performance and a strong indicator that they have been the most regular team in the (semi-) recent history of soccer. They occupied the top spot three decades in a row, from the 1970's to the 1990's.
# * Germany can be considered as the second most regular team, making it 7 times in the top 5 in the last 9 decades. However, they never reached the first position.
# * Spain's domination in the recent history of football is clearly visible here as they have occupied the top spot of this ranking during the last two decades (although the current one is yet to be finished).
# Same ranking as above but with a stricter threshold (more than 25 games in
# the decade), plus a printable decade label such as "1950's" used for the
# facets of the plot.
df_teams_winrate_per_decade_cleaned2 <- df_teams_winrate_per_decade %>%
  filter(nb_games > 25) %>%
  group_by(decade) %>%
  mutate(min_year = min(min_year)) %>%
  top_n(n = 6, wt = winrate) %>%
  ungroup() %>%
  arrange(desc(decade), desc(winrate)) %>%
  mutate(ord = rev(seq_len(n())),
         decade_year = paste0(min_year, "'s"))
# Horizontal bar chart of the six best win rates, one panel per decade.
# Bars are positioned with the numeric `ord` rank and then relabelled with the
# team names, so each panel keeps its own ordering.
ggplot(df_teams_winrate_per_decade_cleaned2, aes(x = ord, y = winrate, fill = teams)) +
  geom_bar(stat = "identity") +
  facet_wrap(~decade_year, scales = "free_y") +
  coord_flip() +
  scale_x_continuous(labels = df_teams_winrate_per_decade_cleaned2$teams,
                     breaks = df_teams_winrate_per_decade_cleaned2$ord) +
  labs(x = "", y = "Win rate (%)", title = "Top 6 best soccer teams per decade") +
  # Team names are on the axis, so the fill legend is hidden.
  # `guides(fill = FALSE)` is deprecated since ggplot2 3.3.4; use "none".
  guides(fill = "none") +
  theme(axis.text.x = element_text(size = 6))

# ## How did the hierarchy between continent evolve?
# 
# The previous plots showed that the dominating nations were often European or... Brazil. But if we look at whole continents, what happens? Are some continents really dominating?
# 
# Continents are defined as local associations such as UEFA or AFC. America is the combination of CONCACAF and CONMEBOL. Australia is part of AFC and, therefore, considered as part of Asia in this case.
# 
# From the two plots below, we can see that globally, median win ratio is similar between continents although the spread might vary a lot. For example, Europe often has some of the best teams but also some of the least performing (generally small states such as Andorra, Luxembourg,...). In comparison, the Americas generally show a more homogeneous distribution.
# Oceania has apparently a high win rate but there are mostly small teams competing between each others (Australia is part of the AFC, i.e. Asia).
# Africa, seems to be the poorest performing continent overall, at least in the last decades.

#df_teams_winrate_per_decade_cleaned2_per_continent <- df_teams_winrate_per_decade %>%
  # filter(nb_games > 25) %>% 
  # group_by(decade) %>%
  # mutate(min_year = min(min_year)) %>%
  # ungroup() %>%
  # arrange(desc(decade), desc(winrate)) %>%
  # mutate(ord = rev(row_number())) %>%
  # mutate(decade_year = paste(min_year, "'s", sep=""))
# ggplot(df_teams_winrate_per_decade_cleaned2_per_continent, aes(x=continent, y=winrate, colour=continent)) +
#   geom_jitter(position=position_jitter(0.2), size=0.5) +
#   facet_wrap(~ decade_year, scales = "free") +
#   coord_flip() +
#   stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median,
#                geom = "crossbar", size = 0.2, width=0.2, colour="black") +
#   labs(x="", y="Win rate (%)") +
#   guides(colour=FALSE)
# %% [code]
#p <- ggplot(df_teams_winrate_per_decade_cleaned2_per_continent, aes(x=continent, y=winrate, colour=continent)) +
#geom_jitter(position=position_jitter(0.2), size=0.5, aes(text=teams)) +
#facet_wrap(~ decade_year, scales = "free") +
#coord_flip() +
#stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median,
#                 geom = "crossbar", size = 0.2, width=0.2, colour="black") +
#labs(x="", y="Win rate (%)") +
#guides(colour=FALSE)

#ggplotly(p)
# df_teams_median_winrate_per_decade_per_continent <- df_teams_winrate_per_decade_cleaned2_per_continent %>%
#   group_by(continent, min_year, decade_year) %>%
  # summarise(median = median(winrate))
# ggplot(df_teams_median_winrate_per_decade_per_continent, aes(x=min_year, y=median, group=continent, colour=continent)) +
#   geom_line() +
#   labs(x="Decade", title="Median win rate", colour="Continent", y="%")
# ## Identifying streaks
# 
# Another way to look at domination at a more fine-grained level is to look at winning streaks, in particular between two different teams.

# First quick look at the data, which are the most common games?
# 
# The most common games oppose neighbouring countries. This makes sense as in the early history of soccer, it was more convenient to play against near by countries.
# Team-perspective games sorted by team, then opponent, then date, so that
# each pairing's games are contiguous and in chronological order.
df_streaks <- arrange(df_teams_games, team, opponent, date)
# Return the first row of `df`, using plain `[` row indexing.
get_first <- function(df) {
  df[1L, ]
}
# Ten most frequently played fixtures.
# Games are counted per ordered (team, opponent) pair; both orientations of a
# fixture then share the same order-independent key "<first>vs<second>"
# (names compared as strings), and only the first row per key is kept.
# Changes from the previous version:
#   * pmin()/pmax() build the key for all rows at once instead of a slow
#     rowwise() min()/max();
#   * distinct(.keep_all = TRUE) replaces the superseded do(head(., 1));
#   * `.groups = "drop"` makes the summarise() output explicitly ungrouped.
# Fixtures with the same count may now appear in a different relative order.
df_streaks %>%
  mutate(team = as.character(team), opponent = as.character(opponent)) %>%
  group_by(team, opponent) %>%
  summarise(n = n(), .groups = "drop") %>%
  mutate(key = paste(pmin(team, opponent), pmax(team, opponent), sep = "vs")) %>%
  distinct(key, .keep_all = TRUE) %>%
  arrange(desc(n)) %>%
  head(10)
## `summarise()` has grouped output by 'team'. You can override using the
## `.groups` argument.
## # A tibble: 10 × 4
##    team      opponent        n key                 
##    <chr>     <chr>       <int> <chr>               
##  1 Argentina Uruguay       179 ArgentinavsUruguay  
##  2 Austria   Hungary       137 AustriavsHungary    
##  3 Belgium   Netherlands   127 BelgiumvsNetherlands
##  4 England   Scotland      117 EnglandvsScotland   
##  5 Kenya     Uganda        110 KenyavsUganda       
##  6 Norway    Sweden        109 NorwayvsSweden      
##  7 Argentina Brazil        108 ArgentinavsBrazil   
##  8 Denmark   Sweden        107 DenmarkvsSweden     
##  9 Scotland  Wales         106 ScotlandvsWales     
## 10 Argentina Paraguay      105 ArgentinavsParaguay
# Let's now identify, which teams have been consistently dominating one of their opponent. I define as streak a series of at least 6 games during which a given team never lost (draws are possible).
# Argentina had the longest streak ever observed against Chile. During 49 years and 35 games, Chile never won against Argentina. However, Chile can still brag of its series of 19 undefeated games against Ecuador.
# Some of these streaks are relatively old but some also ended during the last years (or are still ongoing) and they generally gather teams of the same continent, or even neighbouring countries.

## This chunk is a bit slow, I should see if there are ways to speed it up. 
## Suggestions are welcome
## At the moment, the output is cached (as a dataset) to avoid regenerating it each time.

## TODO:
### Flag streaks which are still ongoing
# extract_streaks <- function(dfs, lvls, min_streak = 6, min_games = 6) {
#   #print(dfs)
#   if (length(dfs$team_outcome) >= min_games) {
#     outcomes <- paste(dfs$team_outcome, collapse="")
#     streaks <- strsplit(outcomes, "L")[[1]]
#     streaks_length <- nchar(streaks)
#     last_longest_streak <- max(which(streaks_length == max(streaks_length)))
#     last_longest_streak_length = max(streaks_length)
#     if (last_longest_streak_length >= min_streak) {
#       streak_begin <- sum(streaks_length[1:last_longest_streak]) + last_longest_streak - last_longest_streak_length
#       streak_end <- streak_begin + last_longest_streak_length - 1
#       streak_df <- dfs[streak_begin:streak_end,]
#       streak_date_start = as.character(streak_df[1,"date"])
#       streak_date_end = as.character(streak_df[last_longest_streak_length,"date"])
#       
#       rm(dfs)
#       gc()
#       
#       if(length(streak_date_start) > 0 & length(streak_date_end) > 0 & last_longest_streak_length > 0) {
#         res <- data.frame(start_date = factor(streak_date_start, levels=lvls), end_date = factor(streak_date_end, levels=lvls), len = last_longest_streak_length)
#         return(res)
#       } else {
#         return(data.frame(start_date = factor(character(0), levels=lvls), end_date = factor(character(0), levels=lvls), len = numeric(0)))
#       }
#     } else {
#       return(data.frame(start_date = factor(character(0), levels=lvls), end_date = factor(character(0), levels=lvls), len = numeric(0)))
#     }
#   } else {
#     return(data.frame(start_date = factor(character(0), levels=lvls), end_date = factor(character(0), levels=lvls), len = numeric(0)))
#   }
#   
#   
# }
# 
# save_file = "../input/international-soccer-games-streaks/df_top_streak.RData"
# if (!file.exists(save_file)) {
#   df_top_streak <- df_streaks %>%
#     group_by(team, opponent) %>%
#     do(extract_streaks(., lvls = unique(df_streaks$date)))
#   save(df_top_streak, file=save_file)
# } else {
#   load(save_file)
# }
# 
# df_top_streak %>%
#   arrange((desc(len))) %>%
#   head(20)
# Below are listed all the games from the Argentina's undefeated streak against Chile.
## Checking one individual example
# Argentina's games against Chile played between 1910-05-27 and 1959-03-07
# (both dates included), from Argentina's point of view.
df_streaks %>%
  filter(team == "Argentina",
         opponent == "Chile",
         date >= as.Date("1910-05-27"),
         date <= as.Date("1959-03-07"))
##          date year                   tournament      team opponent team_score
## 1  1910-05-27 1910                     Friendly Argentina    Chile          3
## 2  1910-06-05 1910                     Friendly Argentina    Chile          5
## 3  1910-09-11 1910                     Friendly Argentina    Chile          3
## 4  1913-09-21 1913                     Friendly Argentina    Chile          2
## 5  1916-07-06 1916                 Copa América Argentina    Chile          6
## 6  1916-07-12 1916                     Friendly Argentina    Chile          1
## 7  1917-10-06 1917                 Copa América Argentina    Chile          1
## 8  1919-05-22 1919                 Copa América Argentina    Chile          4
## 9  1920-09-20 1920                 Copa América Argentina    Chile          1
## 10 1922-09-28 1922                 Copa América Argentina    Chile          4
## 11 1922-10-22 1922                     Friendly Argentina    Chile          1
## 12 1924-10-25 1924                 Copa América Argentina    Chile          2
## 13 1926-10-31 1926                 Copa América Argentina    Chile          1
## 14 1930-07-22 1930               FIFA World Cup Argentina    Chile          3
## 15 1935-01-06 1935                 Copa América Argentina    Chile          4
## 16 1936-12-30 1936                 Copa América Argentina    Chile          2
## 17 1940-03-02 1940                     Friendly Argentina    Chile          4
## 18 1940-03-09 1940                     Friendly Argentina    Chile          3
## 19 1941-01-05 1941                     Friendly Argentina    Chile          2
## 20 1941-01-09 1941                     Friendly Argentina    Chile          5
## 21 1941-03-04 1941                 Copa América Argentina    Chile          1
## 22 1942-01-31 1942                 Copa América Argentina    Chile          0
## 23 1945-02-11 1945                 Copa América Argentina    Chile          1
## 24 1946-01-26 1946                 Copa América Argentina    Chile          3
## 25 1947-12-16 1947                 Copa América Argentina    Chile          1
## 26 1955-03-30 1955                 Copa América Argentina    Chile          1
## 27 1956-01-29 1956                 Copa América Argentina    Chile          2
## 28 1956-03-11 1956    Pan American Championship Argentina    Chile          3
## 29 1957-03-28 1957                 Copa América Argentina    Chile          6
## 30 1957-10-13 1957 FIFA World Cup qualification Argentina    Chile          2
## 31 1957-10-20 1957 FIFA World Cup qualification Argentina    Chile          4
## 32 1959-03-07 1959                 Copa América Argentina    Chile          6
##    opponent_score team_outcome where
## 1               1            W     H
## 2               1            W     H
## 3               0            W     A
## 4               0            W     A
## 5               1            W     H
## 6               0            W     H
## 7               0            W     H
## 8               1            W     H
## 9               1            D     A
## 10              0            W     H
## 11              0            W     H
## 12              0            W     H
## 13              1            D     A
## 14              1            W     H
## 15              1            W     H
## 16              1            W     H
## 17              1            W     H
## 18              2            W     H
## 19              1            W     A
## 20              2            W     A
## 21              0            W     A
## 22              0            D     H
## 23              1            D     A
## 24              1            W     H
## 25              1            D     H
## 26              0            W     A
## 27              0            W     H
## 28              0            W     H
## 29              2            W     H
## 30              0            W     A
## 31              0            W     H
## 32              1            W     H
# # Bonus track:  Evolution of intercontinental games
# 
# Earlier, we saw how different teams and continent start to engage in international games. However, teams tend to start playing against their neighbours. So, when did inter-continental games start and, by extension, soccer became an intercontinental game.
# 
# Games are considered as intercontinental if they oppose two teams from different associations (e.g AFC and UEFA). Australia is part of the AFC but was relocated to Oceania for this analysis. The American continent is the union of the CONMEBOL and CONCACAF associations. 

# We add some features to our data frame so that we can know whether a game is intercontinental.
# df_teams_games_extended <- df_teams_games %>%
#   filter(where=="H") %>%
#   #inner_join(df_federations %>% select(country, continent), by=c("team"="country")) %>%
#   #inner_join(df_federations %>% select(country, continent), by=c("opponent"="country")) %>%
#   rename(continent_home = continent.x,
#          continent_away = continent.y) %>%
#   mutate(continent_home = ifelse(team=="Australia", "Oceania", continent_home),
#          continent_away = ifelse(opponent=="Australia", "Oceania", continent_away)) %>%
#   rowwise() %>%
#   mutate(intercontinental = (continent_home != continent_away)) %>%
#   ungroup()
# 
# tail(df_teams_games_extended)
# The first intercontinental game occurred in 1888 and opposed Scotland to Canada (4-0), 16 years after the very first international game. Notably, Scotland was already involved in the first international game. The next two other intercontinental games happened way later, in 1916 when, in the span of two weeks, the USA played against Sweden and Norway. Between the 1920's and 1940's, intercontinental games became more regular though still sparse (some years didn't count any).
# The end of WWII will mark the beginning of global soccer as, since 1946, there were at least one intercontinental game per year. The percentage of intercontinental game kept increasing until the 1990's, when, in average, about 15% of the games opposed teams of different continent.
# Since then, there has been a slight decrease of intercontinental games.
# 
# There are some visible peaks of intercontinental games which coincide with World Cup years. This makes sense as, by design, teams will face teams of other continents and also take advantage of pre-competition friendly games to gauge their level against a wide range of teams from different continents.
# df_intercontinental_games_per_year <- df_teams_games_extended %>%
#   filter(year < 2018) %>%
#   group_by(year) %>%
#   summarise(nb_inter = sum(intercontinental),
#             perc_inter = nb_inter / length(intercontinental) * 100) %>%
#   ungroup() %>%
#   mutate(worldcup_year = year %in% wc_years)
# ggplot(df_intercontinental_games_per_year, aes(x=year, y=perc_inter)) +
#   geom_line() +
#   geom_point(data = df_intercontinental_games_per_year %>% filter(worldcup_year), aes(colour=worldcup_year)) +
#   geom_smooth(method="loess") +
#   labs(x="Year", title="% of intercontinental games", y="%", colour="World cup year?") +
#   scale_x_continuous(breaks = seq(1870,2020,10))
### When did the first intercontinental games happen?
# df_teams_games_extended %>% filter(intercontinental) %>% head(5)
# Let's have a closer look and see what happens at a continent level.
# 
# Oceania has been the continent most often involved in intercontinental games. In the early years it is due to many games being organised between New Zealand, Australia, India. Canada and South Africa, all Commonwealth countries. In the latest years, this likely due to the fact that Australia is affiliated to the AFC and then plays qualification rounds against Asian teams.
# Europe has for a long time being the continent the least involved in intercontinental games, this is consistent with this continent hosting most of the oldest soccer teams.
# Africa was initially involved in many intercontinental games but is now the continent whose teams travel the least.
# America and Asia have similar trajectories: they used to play many games against teams from other continents but now are playing much more often against "local" opponents.
# df_intercontinental_games_per_year_per_continent <- df_teams_games_extended %>%
#   filter(year < 2018) %>%
#   group_by(year, continent_home) %>%
#   summarise(nb_inter = sum(intercontinental),
#             perc_inter = nb_inter / length(intercontinental) * 100)
# 
# ggplot(df_intercontinental_games_per_year_per_continent, aes(x=year, y=perc_inter, group=continent_home, colour=continent_home)) +
#   geom_smooth(method="loess") +
#   labs(x="Year", title="% of intercontinental games", y="%", colour="Team's continent") +
#   scale_x_continuous(breaks = seq(1870,2020,10))
# List of early games involving teams from Oceania
# df_teams_games_extended %>%
#   filter(( continent_home == "Oceania" | continent_away == "Oceania") & year >= 1922 & year < 1950)
# Looking at how Australia is responsible for the high number of intercontinental games for Oceania.
# df_intercontinental_games_per_year_oceania <- df_teams_games_extended %>%
#   filter(continent_home == "Oceania" | continent_away == "Oceania") %>%
#   mutate(isAustralia = (team == "Australia" | opponent == "Australia")) %>%
#   filter(year < 2018) %>%
#   group_by(year, isAustralia) %>%
#   summarise(nb_inter = sum(intercontinental),
#             perc_inter = nb_inter / length(intercontinental) * 100)
# ggplot(df_intercontinental_games_per_year_oceania, aes(x=year, y=perc_inter, group=isAustralia, colour=isAustralia)) +
#   geom_smooth(method="loess") +
#   scale_x_continuous(breaks = seq(1870,2020,10)) +
#   labs(x="Year", title="% of intercontinental games\nOceania only", y="%", colour="Team's continent")